##################################################
#
# Government 2001 Research Paper
# 
# Recession or not, here we come:
# An investigation of the effect of the Great Recession on college attendance by family income 
# 
# John Hansen, Ryosuke Kobayashi, and Aaron Orzech
# 5/05/2014
# 
# Table of Contents
# 0) Packages
# 1) Load Data
# 2) Data Clean-up
# 3) Create subsets for different models
# 4) Multinomial Models
#    4-1) Table 1 Models
#    4-2) Table 2 Models
# 5) Simulations
#    5-1) Model (5) Linear Year Interaciton Model
#    5-2) Model (4) Year-dummy Model
# 6) Robustness Checks
#    * Model (4) with log_income z scores
#
##################################################

rm(list=ls())

##################################################
# 0) packages
##################################################

#install.packages("mlogit")
library(mlogit)
library(foreign)
library(plyr)
library(MASS)


##################################################
# 1) load data
##################################################

# set the workplace
setwd("~/Desktop/2013-14 Spring/GOV 2001 Quantitative Methods/Replication/Final Data")
# list.files()

# load dataset
cps <- foreign::read.dta("cps_00015.dta")
head(cps)


##################################################
# 2) clean and organize data
##################################################

# keep only: basic college enrollment data and demographic data
subset <- c("year", "serial", "statefip", "pernum", "wtsupp", "PERWT04", "sex", "race", "schlcoll", "ftotval")
data <- cps[subset]

# fix 2004 weights
data$wtsupp[data$year == 2004] <- data$PERWT04[data$year == 2004]
data$schlcoll2 <- as.numeric(data$schlcoll)

# create trichotomous outcome: multi_coll
data[data[,"schlcoll2"]==6,"multi_coll"] <- 0  # not enrolled in college
data[data[,"schlcoll2"]==5,"multi_coll"] <- 1  # enrolled pt (part-time) in college
data[data[,"schlcoll2"]==4,"multi_coll"] <- 2  # enrolled ft (full-time) college
data <- data[ which(data[,"multi_coll"]==0|data[,"multi_coll"]==1|data[,"multi_coll"]==2),]  # get rid of hs students & folks not in sample universe
data[, "multi_coll"] <- as.factor(data[,"multi_coll"])

# income to same scale
data[,"inc_99"] <- 0
data[data$year == 2001, "inc_99"] <- data[data$year == 2001, "ftotval"] *.94073
data[data$year == 2002, "inc_99"] <- data[data$year == 2002, "ftotval"] *.92593
data[data$year == 2003, "inc_99"] <- data[data$year == 2003, "ftotval"] *.90580
data[data$year == 2004, "inc_99"] <- data[data$year == 2004, "ftotval"] *.88183
data[data$year == 2005, "inc_99"] <- data[data$year == 2005, "ftotval"] *.85324
data[data$year == 2006, "inc_99"] <- data[data$year == 2006, "ftotval"] *.82645
data[data$year == 2007, "inc_99"] <- data[data$year == 2007, "ftotval"] *.80321
data[data$year == 2008, "inc_99"] <- data[data$year == 2008, "ftotval"] *.77399
data[data$year == 2009, "inc_99"] <- data[data$year == 2009, "ftotval"] *.77700
data[data$year == 2010, "inc_99"] <- data[data$year == 2010, "ftotval"] *.76394
data[data$year == 2011, "inc_99"] <- data[data$year == 2011, "ftotval"] *.74074
data[data$year == 2012, "inc_99"] <- data[data$year == 2012, "ftotval"] *.72621
data[data$year == 2013, "inc_99"] <- data[data$year == 2013, "ftotval"] *.70000

# Adjusting Income to 2010
data$adj_inc <- 1.309*data$inc_99

# replace income < 0 with 0
data$adj_inc[data$adj_inc < 0] <- 0

# add $1 to all so i can take the log
data$adj_inc <- data$adj_inc + 1

# look at new data
# length(data$adj_inc) # 39385 observations left
# summary(data$adj_inc)

# log of income
data[,"log_inc"] <- log(data[,"adj_inc"])

# generate recenter year variance. 2001 = 0. 2002 = 1. 2003 = 2....etc.
data[,"rec_year"] <- data[,"year"] - 2001

# generate interaction term
data[,"log_inc_x_year"] <- data[,"log_inc"] * data[,"rec_year"]

# recode race variable to collapse most subgroups into "other" category (recoding rationale less arbitary; detailed rationale/sensitivity tests omitted)
data$race2 <- as.numeric(data$race)
data[data[,"race2"]>9&data[,"race2"]<29,"race2"] <- 998
levels(data$race) <- c(levels(data$race), "Other - Recoded")
data$race[data$race2 > 9] <- "Other - Recoded"


##################################################
# 3) create subsets for different models
##################################################

# subset for income of over $8103 (2010$) and income > 0
data1 <- data[data[,"log_inc"] > 9,]
data2 <- data[data[,"log_inc"] > 0,]

# convert data into mlogit format (shape = wide)
dd_all <- mlogit.data(data, shape = "wide", choice = "multi_coll")
dd_pos <- mlogit.data(data2, shape = "wide", choice = "multi_coll")
dd <- mlogit.data(data1, shape = "wide", choice = "multi_coll")


##################################################
# 4) tables 1 and 2: multinomial models
##################################################

##################################################
# 4-1) tables 1  models
#      log_income interacted with linear year term
##################################################

# (1) all income
model_all <- mlogit(formula = multi_coll ~ 1|log_inc + rec_year + log_inc_x_year + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd_all, method = "nr", print.level = 0)
# summary(model_all)

# (2) income > $0
model_pos <- mlogit(formula = multi_coll ~ 1|log_inc + rec_year + log_inc_x_year + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd_pos, method = "nr", print.level = 0)
# summary(model_pos)

# (3) income > $8,103
model <- mlogit(formula = multi_coll ~ 1|log_inc + rec_year + log_inc_x_year + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd, method = "nr", print.level = 0)
# summary(model)

rm(data2, dd_all, dd_pos, model_all, model_pos)


##################################################
# 4-2) tables 2 models
#      other model specifications
#      data: dd - (3) income > $8103 listed above
##################################################

# (1) year dummies
model_yd <- mlogit(formula = multi_coll ~ 1| factor(year) + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd, method = "nr", print.level = 0)
# summary(model_yd)

# (2) linear year
model_ly <- mlogit(formula = multi_coll ~ 1| rec_year + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd, method = "nr", print.level = 0)
# summary(model_ly)

# (3) add log income
model_linc <- mlogit(formula = multi_coll ~ 1|log_inc + rec_year + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd, method = "nr", print.level = 0)
# summary(model_linc)

# (4) add interaction with year dummies
model_linc2 <- mlogit(formula = multi_coll ~ 1|log_inc + rec_year + log_inc:factor(year) + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd, method = "nr", print.level = 0)
# summary(model_linc2)

# (5) log_income interacted with linear year term
# summary(model)


##################################################
# 5) simulations
##################################################

##################################################
# 5-1) simulation using linear year interaciton model
#      model: (5) -- line 159
##################################################
#
# # checking its coef and standard error
# names(model)
# beta <- coef(model)               # coefficients (betas)
# nbetas <- length(beta)/2          # number of betas
# varcov <- -solve(hessian(model))  # variance-covariance matrix
# se <- sqrt(diag(varcov))          # standard error
#
# # simulation
# set.seed(02138)
# sims <- 10000
# simbetas <- mvrnorm(sims, beta, varcov)
#
# # sanity check
# dim(simbetas)
#
# # # extract betas for each group (1=part time; 2=full-time)
# # simbetas_pt <- simbetas[,(1:nbetas)*2-1]
# # simbetas_ft <- simbetas[,(1:nbetas)*2]
# simbetas_pt <- as.matrix(simbetas[,(1:nbetas)*2-1])
# simbetas_ft <- as.matrix(simbetas[,(1:nbetas)*2])
#
# # simbetas
# head(simbetas_pt)
# head(simbetas_ft)
# pt_names <- colnames(simbetas_pt)
# colnames(simbetas_pt) <- c("intercept", "log_inc", "rec_year", "log_inc_x_year", pt_names[5:length(pt_names)])
# ft_names <- colnames(simbetas_ft)
# colnames(simbetas_ft) <- c("intercept", "log_inc", "rec_year", "log_inc_x_year", ft_names[5:length(ft_names)])
#
# # simb <- array(NA, dim = c(sims,nbetas,2))   # 3d array: row(one per sim draw), column(one per param), sheet(one per category except the base = not-enrolled)
# # simb[,,1] <- simbetas[,(1:nbetas)*2-1]      # part-time
# # simb[,,2] <- simbetas[,(1:nbetas)*2]        # full-time
# #
# # # sanity check
# # simb[1,2,1] # first element of the simbeta draws of LOG_INC for category "academic"
# # simb[1,2,2] # first element of the simbeta draws of LOG_INC for "vocational"
# # colnames(simbetas_pt) # if you want to check colnames
#
# #####
#
# # setting the explanatory variables.
# # my apologies for hard-coding this, I couldn't find a way to extract this from any given model.
#
# setx <- matrix(nrow = 1, ncol=nbetas)
# names(setx) <- colnames(simbetas_pt)
# setx[1] <- 1        # intercept ### 1
#
# # race/gender covariates
# setx[5:13] <- 0  # for simplicity, we'll look at white male
#
# # calculate predicted probabilities
#
# # part-time
# pred.part <- exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft)))
# # full-time
# pred.full <- exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft)))
# # not-enrolled
# pred.not <- 1-pred.part-pred.full
#
# pred.results <- cbind(t(pred.not),t(pred.part),t(pred.full))
#
# mean(pred.part)
# mean(pred.full)
# mean(pred.not)
#
# # simulate for recently independent over years of interest
# range = c(2001:2013)
#
# pred_ft_low = NULL
# pred_pt_low = NULL
# pred_no_low = NULL
#
# for (i in 0:(length(range)-1)){
#   setx[2] <- 10       # Setting to Low Income ($22,026 USD)
#   setx["rec_year"] <- i
#   setx["log_inc_x_year"] <- i*10
#   pred_pt_low[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
#   pred_ft_low[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
#   pred_no_low[1+i] <- 1 - (pred_pt_low[1+i] + pred_ft_low[1+i])
# }
#
#
# pred_ft_mid = NULL
# pred_pt_mid = NULL
# pred_no_mid = NULL
#
#
# for (i in 0:(length(range)-1)){
#   setx[2] <- 11       # Setting to mid Income ($162,755 USD)
#   setx["rec_year"] <- i
#   setx["log_inc_x_year"] <- i*12
#   pred_pt_mid[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
#   pred_ft_mid[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
#   pred_no_mid[1+i] <- 1 - (pred_pt_mid[1+i] + pred_ft_mid[1+i])
# }
#
# pred_ft_high = NULL
# pred_pt_high = NULL
# pred_no_high = NULL
#
# for (i in 0:(length(range)-1)){
#   setx[2] <- 12       # Setting to high Income ($59,874 USD)
#   setx["rec_year"] <- i
#   setx["log_inc_x_year"] <- i*12
#   pred_pt_high[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
#   pred_ft_high[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
#   pred_no_high[1+i] <- 1 - (pred_pt_high[1+i] + pred_ft_high[1+i])
# }
#
# plot(pred_pt_high)
# plot(pred_ft_high)
# plot(pred_no_high)
#
# # overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
# pdf("figure_pt.pdf")
# plot(c(min(range) , max(range)), c(min(0, 0), max(c(.1, .11))), type = "n",
#      xlab = "Year", ylab = "Probability",
#      main = "Probability of Part-Time Enrollment by Income (Table 1 Model)")
# lines(range, pred_pt_low, type = "o", col="blue")
# lines(range, pred_pt_mid, type = "o", col="green")
# lines(range, pred_pt_high, type = "o", col="red")
# legend(2001,.02, c("Low-Income ($22,026)", "Middle-Income ($59,874)", "High-Income ($162,755)"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
# dev.off()
#
# # overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
# pdf("figure_ft.pdf")
# plot(c(min(range) , max(range)), c(min(pred_ft_low, pred_ft_high), max(c(pred_ft_low, pred_ft_high))*1.2), type = "n",
#      xlab = "Year)", ylab = "Probability",
#      main = "Probability of Full-Time Enrollment by Income (Table 1 Model)")
# lines(range, pred_ft_low, type = "o", col="blue")
# lines(range, pred_ft_mid, type = "o", col="green")
# lines(range, pred_ft_high,type = "o", col="red")
# legend(2001,.8, c("Low-Income ($22,026)", "Middle-Income ($59,874)", "High-Income ($162,755)"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
# dev.off()
#
# # overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
# pdf("figure_no_enroll.pdf")
# plot(c(min(range) , max(range)), c(min(pred_no_high, pred_no_low), max(c(pred_no_high, pred_no_low))*1.2), type = "n",
#      xlab = "Year", ylab = "Probability",
#      main = "Probability of Not Being Enrolled in College (Table 1 Model)")
# lines(range, pred_no_low, type = "o", col="blue")
# lines(range, pred_no_mid, type = "o", col="green")
# lines(range, pred_no_high, type = "o", col="red")
# legend(2006,.90, c("Low-Income ($22,026)", "Middle-Income ($59,874)", "High-Income ($162,755)"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
# dev.off()
#
# # overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
# pdf("figure_double.pdf")
# plot(c(min(range) , max(range)), c(min(0,0), max(c(1,1))), type = "n",
#      xlab = "Year", ylab = "Probability",
#      main = "Full-Time Enrollment vs Not Being Enrolled (Table 1 Model)")
# lines(range, pred_no_low, lty=2, col="blue")
# lines(range, pred_no_high, lty=2, col="red")
# lines(range, pred_ft_low, type = "o", col="blue")
# lines(range, pred_ft_high, type = "o", col="red")
# legend(2008,.20, c("Low-$, Not Enrolled", "Low-$, FT Enrolled", "High-$, Not Enrolled", "High-$, Enrolled"), lty=c(2,1,2,1), lwd=c(2.5, 2.5, 2.5, 2.5), col=c("blue", "blue", "red", "red"))
# dev.off()


##################################################
# 5-2) simulation using year-dummy model
#      model: (4) -- line 155
##################################################

# checking its coef and standard error
beta <- coef(model_linc2)               # coefficients (betas)
nbetas <- length(beta)/2                # number of betas
varcov <- -solve(hessian(model_linc2))  # variance-covariance matrix
se <- sqrt(diag(varcov))                # standard error

# simulation
set.seed(02138)
sims <- 100000
simbetas <- mvrnorm(sims, beta, varcov)

# sanity check
# dim(simbetas) # 24*2 betas

# # extract betas for each group (1=part time; 2=full-time)
simbetas_pt <- as.matrix(simbetas[,(1:nbetas)*2-1])
simbetas_ft <- as.matrix(simbetas[,(1:nbetas)*2])

# setting the hypothetical covariates.
setx <- matrix(nrow = 1, ncol=nbetas)
names(setx) <- colnames(simbetas_pt)
setx[] <- 0

# calculate predicted probabilities
# simulate for recently independent over years of interest
range = c(2001:2013)

pred_ft_low = NULL
pred_pt_low = NULL
pred_no_low = NULL

for (i in 0:(length(range)-1)){
  setx[] <- 0
  setx[1] <- 1        # intercept ### 1
  setx[2] <- 10       # Setting to Low Income ($22,026 USD)
  setx[3] <- i        # rec_year: i (2001 = 0, 2002 = 1 etc.)
  setx[4] <- 0        # gender: male
  setx[5:12] <- 0     # for simplicity, we'll look at white male
  setx[12+i] <- (i/i)*10
  setx[is.nan(setx)] <- 0
  pred_pt_low[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_ft_low[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_no_low[1+i] <- 1 - (pred_pt_low[1+i] + pred_ft_low[1+i])
}

pred_ft_mid = NULL
pred_pt_mid = NULL
pred_no_mid = NULL

for (i in 0:(length(range)-1)){
  setx[] <- 0
  setx[1] <- 1        # intercept ### 1
  setx[2] <- 11       # Setting to mid income
  setx[3] <- i        # rec_year: i (2001 = 0, 2002 = 1 etc.)
  setx[4] <- 0        # gender: male
  setx[5:12] <- 0     # for simplicity, we'll look at white male
  setx[12+i] <- (i/i)*11
  setx[is.nan(setx)] <- 0
  pred_pt_mid[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_ft_mid[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_no_mid[1+i] <- 1 - (pred_pt_mid[1+i] + pred_ft_mid[1+i])
}

pred_ft_high = NULL
pred_pt_high = NULL
pred_no_high = NULL

for (i in 0:(length(range)-1)){
  setx[] <- 0
  setx[1] <- 1        # intercept ### 1
  setx[2] <- 12       # Setting to high income
  setx[3] <- i        # rec_year: i (2001 = 0, 2002 = 1 etc.)
  setx[4] <- 0        # gender: male
  setx[5:12] <- 0     # for simplicity, we'll look at white male
  setx[12+i] <- (i/i)*12
  setx[is.nan(setx)] <- 0
  pred_pt_high[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_ft_high[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_no_high[1+i] <- 1 - (pred_pt_high[1+i] + pred_ft_high[1+i])
}

plot(pred_pt_high)
plot(pred_ft_high)
plot(pred_no_high)

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_pt.pdf")
plot(c(min(range) , max(range)), c(min(0, 0), max(c(.1, .11))), type = "n",
     xlab = "Year", ylab = "Probability",
     main = "Probability of Part-Time Enrollment by Income")
lines(range, pred_pt_low, type = "o", col="blue")
lines(range, pred_pt_mid, type = "o", col="green")
lines(range, pred_pt_high, type = "o", col="red")
legend(2001,.02, c("Low-Income ($22,026)", "Middle-Income ($59,874 USD)", "High-Income ($162,755)"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
dev.off()

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_ft.pdf")
plot(c(min(range) , max(range)), c(min(pred_ft_low, pred_ft_high), max(c(pred_ft_low, pred_ft_high))*1.2), type = "n",
     xlab = "Year)", ylab = "Probability",
     main = "Probability of Full-Time Enrollment by Income")
lines(range, pred_ft_low, type = "o", col="blue")
lines(range, pred_ft_mid, type = "o", col="green")
lines(range, pred_ft_high,type = "o", col="red")
legend(2001,.8, c("Low-Income", "Middle-Income", "High-Income"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
dev.off()

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_no_enroll.pdf")
plot(c(min(range) , max(range)), c(min(pred_no_high, pred_no_low), max(c(pred_no_high, pred_no_low))*1.2), type = "n",
     xlab = "Year", ylab = "Probability",
     main = "Probability of Not Being Enrolled in College")
lines(range, pred_no_low, type = "o", col="blue")
lines(range, pred_no_mid, type = "o", col="green")
lines(range, pred_no_high, type = "o", col="red")
legend(2009,.85, c("Low-Income", "Middle-Income", "High-Income"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
dev.off()

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_double.pdf")
plot(c(min(range) , max(range)), c(min(0,0), max(c(1,1))), type = "n",
     xlab = "Year", ylab = "Probability",
     main = "Full-Time Enrollment vs Not Being Enrolled")
lines(range, pred_no_low, lty=2, col="blue")
lines(range, pred_no_high, lty=2, col="red")
lines(range, pred_ft_low, type = "o", col="blue")
lines(range, pred_ft_high, type = "o", col="red")
legend(2008,.20, c("Low-$, Not Enrolled", "Low-$, FT Enrolled", "High-$, Not Enrolled", "High-$, Enrolled"), lty=c(2,1,2,1), lwd=c(2.5, 2.5, 2.5, 2.5), col=c("blue", "blue", "red", "red"))
dev.off()



##################################################
# 6) robustness checks
#   model: (4), but with log_income z scores
##################################################

# log_income z scores
data1$z_log_inc <- scale(data1$log_inc, center = TRUE, scale = TRUE)
dd <- mlogit.data(data1, shape = "wide", choice = "multi_coll")

model_z1 <- mlogit(formula = multi_coll ~ 1|z_log_inc + rec_year + z_log_inc:factor(year) + factor(sex) + factor(race) | 0, weights = wtsupp,data=dd, method = "nr", print.level = 0)
# summary(model_z1)

# checking its coef and standard error
names(model_z1)
beta <- coef(model_z1)              # coefficients (betas)
nbetas <- length(beta)/2            # number of betas
varcov <- -solve(hessian(model_z1)) # variance-covariance matrix
se <- sqrt(diag(varcov))            # standard error

# simulation
set.seed(02138)
sims <- 10000
simbetas <- mvrnorm(sims, beta, varcov)

# sanity check
# dim(simbetas)

# extract betas for each group (1=part time; 2=full-time)
simbetas_pt <- as.matrix(simbetas[,(1:nbetas)*2-1])
simbetas_ft <- as.matrix(simbetas[,(1:nbetas)*2])

# setting the hypothetical covariates
setx <- matrix(nrow = 1, ncol=nbetas)
names(setx) <- colnames(simbetas_pt)
setx[1] <- 1     # intercept = 1

# race/gender covariates
setx[4:12] <- 0  # for simplicity, we'll look at white male

# simulate for recently independent over years of interest
range = c(2001:2013)

pred_ft_low = NULL
pred_pt_low = NULL
pred_no_low = NULL

for (i in 0:(length(range)-1)){
  setx[2] <- mean(data1[data1[,"log_inc"]<=10.001&data1[,"log_inc"]>=9.999,"z_log_inc"])       # Setting to Low Income ($22,026 USD)
  setx[3] <- i
  setx[13:24] <- 0 # clear all z_log_inc:factor(year)
  if(i>0){
    setx[12+i] <- setx[2]
  }
  pred_pt_low[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_ft_low[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_no_low[1+i] <- 1 - (pred_pt_low[1+i] + pred_ft_low[1+i])
}

pred_ft_mid = NULL
pred_pt_mid = NULL
pred_no_mid = NULL

for (i in 0:(length(range)-1)){
  setx[2] <-  mean(data1[data1[,"log_inc"]<=11.001&data1[,"log_inc"]>=10.999,"z_log_inc"])      # Setting to mid Income ($59,874 USD)
  setx[3] <- i
  setx[13:24] <- 0 # clear all z_log_inc:factor(year)
  if(i>0){
    setx[12+i] <- setx[2]
  }
  pred_pt_mid[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_ft_mid[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_no_mid[1+i] <- 1 - (pred_pt_mid[1+i] + pred_ft_mid[1+i])
}

pred_ft_high = NULL
pred_pt_high = NULL
pred_no_high = NULL

for (i in 0:(length(range)-1)){
  setx[2] <- mean(data1[data1[,"log_inc"]<=12.001&data1[,"log_inc"]>=11.999,"z_log_inc"])       # Setting to high Income ($162,755 USD)
  setx[3] <- i
  setx[13:24] <- 0 # clear all z_log_inc:factor(year)
  if(i>0){
    setx[12+i] <- setx[2]
  }
  pred_pt_high[1+i] <- mean(exp(setx%*%t(simbetas_pt))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_ft_high[1+i] <- mean(exp(setx%*%t(simbetas_ft))/(1 + exp(setx%*%t(simbetas_pt))+exp(setx%*%t(simbetas_ft))))
  pred_no_high[1+i] <- 1 - (pred_pt_high[1+i] + pred_ft_high[1+i])
}

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_pt_zscore.pdf")
plot(c(min(range) , max(range)), c(min(0, 0), max(c(.1, .11))), type = "n",
     xlab = "Year", ylab = "Probability",
     main = "Probability of Part-Time Enrollment by Income")
lines(range, pred_pt_low, type = "o", col="blue")
lines(range, pred_pt_mid, type = "o", col="green")
lines(range, pred_pt_high, type = "o", col="red")
legend(2001,.02, c("Low-Income ($22,026)", "Middle-Income ($59,874 USD)", "High-Income ($162,755)"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
dev.off()

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_ft_zscore.pdf")
plot(c(min(range) , max(range)), c(min(pred_ft_low, pred_ft_high), max(c(pred_ft_low, pred_ft_high))*1.2), type = "n",
     xlab = "Year)", ylab = "Probability",
     main = "Probability of Full-Time Enrollment by Income")
lines(range, pred_ft_low, type = "o", col="blue")
lines(range, pred_ft_mid, type = "o", col="green")
lines(range, pred_ft_high,type = "o", col="red")
legend(2001,.8, c("Low-Income", "Middle-Income", "High-Income"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
dev.off()

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_no_enroll_zscore.pdf")
plot(c(min(range) , max(range)), c(min(pred_no_high, pred_no_low), max(c(pred_no_high, pred_no_low))*1.2), type = "n",
     xlab = "Year", ylab = "Probability",
     main = "Probability of Not Being Enrolled in College")
lines(range, pred_no_low, type = "o", col="blue")
lines(range, pred_no_mid, type = "o", col="green")
lines(range, pred_no_high, type = "o", col="red")
legend(2009,.85, c("Low-Income", "Middle-Income", "High-Income"), lty=c(1,1,1), lwd=c(2.5, 2.5, 2.5), col=c("blue", "green", "red"))
dev.off()

# overlay
# setwd("/Users/johnhansen/Dropbox/R/gov_2001/project/output")
pdf("figure_double_zscore.pdf")
plot(c(min(range) , max(range)), c(min(0,0), max(c(1,1))), type = "n",
     xlab = "Year", ylab = "Probability",
     main = "Full-Time Enrollment vs Not Being Enrolled")
lines(range, pred_no_low, lty=2, col="blue")
lines(range, pred_no_high, lty=2, col="red")
lines(range, pred_ft_low, type = "o", col="blue")
lines(range, pred_ft_high, type = "o", col="red")
legend(2008,.20, c("Low-$, Not Enrolled", "Low-$, FT Enrolled", "High-$, Not Enrolled", "High-$, Enrolled"), lty=c(2,1,2,1), lwd=c(2.5, 2.5, 2.5, 2.5), col=c("blue", "blue", "red", "red"))
dev.off()